suppressPackageStartupMessages({
library(Seurat, quietly = T)
library(HGNChelper, quietly = T)
library(dplyr, quietly = T)
library(openxlsx, quietly = T)
library(reshape2, quietly = T)
library(data.table, quietly = T)
library(ggplot2, quietly = T)
library(ggraph, quietly = T)
library(ggpubr, quietly = T)
library(igraph, quietly = T)
library(RColorBrewer, quietly = T)
})
data_path = '/data3/hratch/norcross_abc/'
sctype_path = '/data2/hratch/Software/'
abc.integrated<-readRDS(file = paste0(data_path, 'interim/abc_integrated.RDS'))
md<-abc.integrated@meta.data
# git clone git@github.com:IanevskiAleksandr/sc-type.git --> commit ID 36e298c49a57846c8de3f8d1ee58f753d3a9a2a0
# load sc-type functions from locally cloned repo (ensures consistency with version)
source(paste0(sctype_path, 'sc-type/', 'R/gene_sets_prepare.R'))
source(paste0(sctype_path, 'sc-type/', 'R/sctype_score_.R'))
db_<-paste0(sctype_path, 'sc-type/', 'ScTypeDB_full.xlsx')
# # load from remote
# source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")
# source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")
# db_ = "https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_full.xlsx"
path_to_db_file = db_
cell_type = "Immune system"
cell_markers = openxlsx::read.xlsx(path_to_db_file)
cell_markers = cell_markers[cell_markers$tissueType == cell_type,]
cell_markers$geneSymbolmore1 = gsub(" ","",cell_markers$geneSymbolmore1); cell_markers$geneSymbolmore2 = gsub(" ","",cell_markers$geneSymbolmore2)
tissue = "Immune system"
# prepare gene sets
suppressWarnings({
suppressMessages({
gs_list = gene_sets_prepare(db_, tissue)
})
})
The cells can be annotated as any of the following cell types:
intersect(names(gs_list$gs_positive), names(gs_list$gs_negative))
# get cell-type by cell matrix (scores each cell for each cell type)
es.max = sctype_score(scRNAseqData = abc.integrated@assays$integrated@scale.data,
scaled = TRUE,
gs = gs_list$gs_positive, gs2 = gs_list$gs_negative,
gene_names_to_uppercase = TRUE)
# merge by cluster by taking the sum of the scores of each cell type
cL_resutls = do.call("rbind", lapply(unique(md$seurat_clusters), function(cl){
es.max.cl = sort(rowSums(es.max[ ,rownames(md[md$seurat_clusters==cl, ])]), decreasing = !0)
head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(md$seurat_clusters==cl)), 10)
}))
write.csv(cL_resutls,
paste0(data_path, 'interim/celltype_cluster_scores.csv'))
# take max score of each cluster
sctype_scores = cL_resutls %>% group_by(cluster) %>% top_n(n = 1, wt = scores)
sctype_scores[['freq']]<-100*sctype_scores$ncells/sum(sctype_scores$ncells)
sctype_scores[['Confidence']]<-'good'
sctype_scores$Confidence[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = "bad"
sctype_scores[with(sctype_scores, order(cluster)), ]
| cluster | type | scores | ncells | freq | Confidence |
|---|---|---|---|---|---|
| <fct> | <chr> | <dbl> | <int> | <dbl> | <chr> |
| 0 | Naive CD8+ T cells | 8409.80958 | 5556 | 17.5406472 | good |
| 1 | Pre-B cells | 12057.23958 | 3496 | 11.0370955 | good |
| 2 | Naive CD4+ T cells | 3909.46162 | 2579 | 8.1420679 | good |
| 3 | Memory CD8+ T cells | 5769.90904 | 2476 | 7.8168903 | good |
| 4 | Myeloid Dendritic cells | 4163.55192 | 2356 | 7.4380426 | good |
| 5 | Basophils | 1110.35901 | 2089 | 6.5951066 | good |
| 6 | Effector CD8+ T cells | 4112.45330 | 1645 | 5.1933702 | good |
| 7 | Macrophages | 3757.40204 | 1574 | 4.9692186 | good |
| 8 | Non-classical monocytes | 1206.32986 | 988 | 3.1191792 | good |
| 9 | Myeloid Dendritic cells | 2178.10713 | 967 | 3.0528808 | good |
| 10 | CD8+ NKT-like cells | 2463.14662 | 961 | 3.0339384 | good |
| 11 | Plasmacytoid Dendritic cells | 2303.04401 | 875 | 2.7624309 | good |
| 12 | Eosinophils | -75.54162 | 872 | 2.7529597 | bad |
| 13 | Natural killer cells | 91.08425 | 798 | 2.5193370 | bad |
| 14 | Myeloid Dendritic cells | 776.45560 | 727 | 2.2951855 | good |
| 15 | CD4+ NKT-like cells | 782.70961 | 549 | 1.7332281 | good |
| 16 | Progenitor cells | 1029.58665 | 478 | 1.5090766 | good |
| 17 | Macrophages | 2832.49822 | 477 | 1.5059195 | good |
| 18 | Myeloid Dendritic cells | 162.83592 | 472 | 1.4901342 | good |
| 19 | Natural killer cells | 2509.25371 | 334 | 1.0544594 | good |
| 20 | Pre-B cells | 920.31825 | 323 | 1.0197316 | good |
| 21 | Effector CD4+ T cells | 1353.22775 | 314 | 0.9913181 | good |
| 22 | Plasmacytoid Dendritic cells | 735.13280 | 310 | 0.9786898 | good |
| 23 | Neutrophils | 2067.02922 | 217 | 0.6850829 | good |
| 24 | γδ-T cells | 235.62306 | 134 | 0.4230466 | good |
| 25 | Naive B cells | 723.09840 | 69 | 0.2178374 | good |
| 26 | Pre-B cells | 117.54717 | 39 | 0.1231255 | good |
A cluster annotation has "low" confidence if the ScType score is < 0 or < 0.25*# of cells in that cluster. ScType annotates these as unknown, s.t. seemingly not enough information is available in the transcriptomics data and markers database. We will want to double check these and perhaps keep them labelled as Unknown.
unique(sctype_scores$type)
# assign cell types to barcodes
mapper<-setNames(sctype_scores$type, sctype_scores$cluster)
md[['Cell.Type.ScType']]<-unlist(unname(mapper[as.character(md$seurat_clusters)]))
co<-c('Progenitor cells', 'Naive B cells', 'Pre-B cells', 'γδ-T cells',
'Naive CD4+ T cells', 'Effector CD4+ T cells',
'Naive CD8+ T cells', 'Effector CD8+ T cells', 'Memory CD8+ T cells',
'CD4+ NKT-like cells', 'CD8+ NKT-like cells', 'Natural killer cells',
'Myeloid Dendritic cells', 'Plasmacytoid Dendritic cells',
'Non-classical monocytes', 'Macrophages',
'Basophils', 'Eosinophils', 'Neutrophils')
md[['Cell.Type.ScType']]<-factor(x = md$Cell.Type.ScType,levels = co)
# retain individual barcode scores
md<-cbind(md, t(es.max[, rownames(md)]))
abc.integrated@meta.data<-md
Let's see what the distribution of scores was for the annotations:
viz.df<-md[!colnames(md) %in% colnames(md)[c(1,2,3,4,5, 7)]]
viz.df<-melt(viz.df, value.name = 'Annotation.Score')
colnames(viz.df)<-c('seurat_clusters', 'Cell.Type', 'Annotation.Score')
h_ = 50
w_ = 30
options(repr.plot.height=h_, repr.plot.width=w_)
g1<-ggplot(data = viz.df, aes(x = Cell.Type, y = Annotation.Score)) + geom_violin() +
facet_wrap(~seurat_clusters, ncol = 2)+
theme(axis.text.x = element_text(angle = 45))
g1
Warning message in melt(viz.df, value.name = "Annotation.Score"): “The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(viz.df). In the next version, this warning will become an error.” Using seurat_clusters as id variables
# prepare edges
cL_resutls=cL_resutls[order(cL_resutls$cluster),]; edges = cL_resutls; edges$type = paste0(edges$type,"_",edges$cluster); edges$cluster = paste0("cluster ", edges$cluster); edges = edges[,c("cluster", "type")]; colnames(edges) = c("from", "to"); rownames(edges) <- NULL
# prepare nodes
nodes_lvl1 = sctype_scores[,c("cluster", "ncells")]; nodes_lvl1$cluster = paste0("cluster ", nodes_lvl1$cluster); nodes_lvl1$Colour = "#f1f1ef"; nodes_lvl1$ord = 1; nodes_lvl1$realname = nodes_lvl1$cluster; nodes_lvl1 = as.data.frame(nodes_lvl1); nodes_lvl2 = c();
ccolss= c("#5f75ae","#92bbb8","#64a841","#e5486e","#de8e06","#eccf5a","#b5aa0f","#e4b680","#7ba39d","#b15928","#ffff99", "#6a3d9a","#cab2d6","#ff7f00","#fdbf6f","#e31a1c","#fb9a99","#33a02c","#b2df8a","#1f78b4","#a6cee3")
for (i in 1:length(unique(cL_resutls$cluster))){
dt_tmp = cL_resutls[cL_resutls$cluster == unique(cL_resutls$cluster)[i], ]; nodes_lvl2 = rbind(nodes_lvl2, data.frame(cluster = paste0(dt_tmp$type,"_",dt_tmp$cluster), ncells = dt_tmp$scores, Colour = ccolss[i], ord = 2, realname = dt_tmp$type))
}
nodes = rbind(nodes_lvl1, nodes_lvl2); nodes$ncells[nodes$ncells<1] = 1;
files_db = openxlsx::read.xlsx(db_)[,c("cellName","shortName")]; files_db = unique(files_db); nodes = merge(nodes, files_db, all.x = T, all.y = F, by.x = "realname", by.y = "cellName", sort = F)
nodes$shortName[is.na(nodes$shortName)] = nodes$realname[is.na(nodes$shortName)]; nodes = nodes[,c("cluster", "ncells", "Colour", "ord", "shortName", "realname")]
mygraph <- graph_from_data_frame(edges, vertices=nodes)
# Make the graph
gggr<- ggraph(mygraph, layout = 'circlepack', weight=I(ncells)) +
geom_node_circle(aes(filter=ord==1,fill=I("#F5F5F5"), colour=I("#D3D3D3")), alpha=0.9) + geom_node_circle(aes(filter=ord==2,fill=I(Colour), colour=I("#D3D3D3")), alpha=0.9) +
theme_void() + geom_node_text(aes(filter=ord==2, label=shortName, colour=I("#ffffff"), fill="white", repel = !1, parse = T, size = I(log(ncells,25)*1.5)))+ geom_node_label(aes(filter=ord==1, label=shortName, colour=I("#000000"), size = I(3), fill="white", parse = T), repel = !0, segment.linetype="dotted")
h_ = 10
w_ = 10
options(repr.plot.height=h_, repr.plot.width=w_)
gggr
# scater::multiplot(DimPlot(pbmc, reduction = "umap", label = TRUE, repel = TRUE, cols = ccolss), gggr, cols = 2)
Non-leaf weights ignored Warning message: “Ignoring unknown aesthetics: fill, repel, parse” Warning message: “Ignoring unknown aesthetics: parse”
Let's look at distributions of scores for the cell type that was actually assigned to each cluster:
# filter for the score that was assigned
retain.idx<-c()
for (cluster in names(mapper)){
retain.idx<-c(retain.idx,
rownames(viz.df[(viz.df$seurat_clusters == cluster) & (viz.df$Cell.Type == mapper[cluster]), ]))
}
viz.df<-viz.df[retain.idx,]
h_ = 5
w_ = 15
options(repr.plot.height=h_, repr.plot.width=w_)
g2<-ggplot(data = viz.df, aes(x = seurat_clusters, y = Annotation.Score)) + geom_violin()
g2
h_ = 8
w_ = 16
options(repr.plot.height=h_, repr.plot.width=w_)
g3a <- DimPlot(abc.integrated, reduction = "umap", group.by = "seurat_clusters", shuffle = T, label = T)+
ggtitle('Cluster')
g3c <- DimPlot(abc.integrated, reduction = "umap", group.by = "Cell.Type.ScType", shuffle = T, label = F)+
ggtitle('Cell Type')
g3<-g3a + g3c
# for (ext in c('.svg', '.png', '.pdf')){ggsave(paste0(data_path, 'figures/processing/',
# 'UMAP_cluster_celltype', ext), g3)}
g3
Let's see whether the many-to-one mappings from cluster to cell type label are for clusters near each other in UMAP space or far apart. We will want to double check the ones that are far apart.
md<-abc.integrated@meta.data
md_og<-abc.integrated@meta.data
cluster.label<-'seurat_clusters'
unique.cell.cluster<-md[!duplicated(md[c(cluster.label, 'Cell.Type.ScType')]), ]
celltypes.multiple<-names(which(table(unique.cell.cluster$Cell.Type.ScType) > 1))
print('The following cells are labelled in more than one cluster: ')
celltypes.multiple
[1] "The following cells are labelled in more than one cluster: "
# label according to both
parse.cluster<-function(y, x){
if (y %in% celltypes.multiple){
return(paste0(y, ' - ', x))
}else{
return(as.character(y))
}
}
md[['Cell.Type.uniquecluster']]<-mapply(parse.cluster, md$Cell.Type.ScType, md[[cluster.label]])
co<-c('Progenitor cells', 'Naive B cells', 'Pre-B cells - 1', 'Pre-B cells - 20', 'Pre-B cells - 26', 'γδ-T cells',
'Naive CD4+ T cells', 'Effector CD4+ T cells',
'Naive CD8+ T cells', 'Effector CD8+ T cells', 'Memory CD8+ T cells',
'CD4+ NKT-like cells', 'CD8+ NKT-like cells', 'Natural killer cells - 19', 'Natural killer cells - 13',
'Myeloid Dendritic cells - 4', 'Myeloid Dendritic cells - 9', 'Myeloid Dendritic cells - 14',
'Myeloid Dendritic cells - 18',
'Plasmacytoid Dendritic cells - 11', 'Plasmacytoid Dendritic cells - 22',
'Non-classical monocytes', 'Macrophages - 7','Macrophages - 17',
'Basophils', 'Eosinophils', 'Neutrophils')
md[['Cell.Type.uniquecluster']]<-factor(x = md$Cell.Type.uniquecluster,levels = co)
mapper<-setNames(1:length((unique(md$Cell.Type.uniquecluster))), sort(unique(md$Cell.Type.uniquecluster)))
md[['Cell.Type.forUMAP']]<-unname(unlist(mapper[md[['Cell.Type.uniquecluster']]]))
md[['Cell.Type.forUMAP']]<-factor(x = md$Cell.Type.forUMAP,
levels = sort(unique(md[['Cell.Type.forUMAP']])))
legend.labels<-paste0(unname(unlist(mapper)), ': ', names(mapper))
abc.integrated@meta.data<-md
Annotation: "#: Cell Type - Cluster Label"
h_ = 7
w_ = 16
options(repr.plot.height=h_, repr.plot.width=w_)
colors = c(brewer.pal(n=12, "Set3"),
brewer.pal(n=8, "Dark2"),
brewer.pal(n=7, "Accent"))
theme = theme(panel.background = element_blank(), panel.border = element_rect(colour = 'black', fill = 'NA'),
text = element_text(size=28), panel.spacing = unit(1.15, "lines"),
axis.title=element_text(size=28), legend.text=element_text(size=16),
legend.title=element_text(size=20), axis.text.x = element_text(size = 22),
axis.text.y = element_text(size = 22))
g4<-DimPlot(abc.integrated, reduction = 'umap',
group.by = 'Cell.Type.forUMAP', label = T)+
xlab('UMAP 1')+ylab('UMAP 2')+
scale_color_manual(values = colors, labels = legend.labels) + labs(color='Cell Type')+theme+
ggtitle('Cell Types (Clusters distinguished)')
g4
Pre-b cells, myeloid DCs, and macrophages cluster together.
pDCs are near each other but not directly adjacent.
NK Cells do not cluster together. Note, one of the NK cell labels maps to cluster 13, which was a low confidence label from ScType.
h_ = 7
w_ = 16
options(repr.plot.height=h_, repr.plot.width=w_)
gQC1<-ggplot(data = abc.integrated@meta.data,
aes(y = percent.mt, x = seurat_clusters, fill = seurat_clusters))+geom_violin()+
scale_y_continuous(breaks = round(seq(0, max(md[['percent.mt']]), by = 5),1))+
theme_bw()
gQC1
h_ = 7
w_ = 16
options(repr.plot.height=h_, repr.plot.width=w_)
gQC2<-ggplot(data = abc.integrated@meta.data,
aes(y = nCount_RNA, x = seurat_clusters, fill = seurat_clusters))+geom_violin()+
scale_y_continuous(breaks = round(seq(0, max(md[['nCount_RNA']]), by = 1e4),1))+
theme_bw()
gQC2
Looking at the QC metrics by cluster, along witht the lack of annotation for clusters 12 and 13, we may consider dropping these ones.
abc.integrated@meta.data<-md_og
Dotplot of each marker to double check that annotated cell types make sense.
markers<-unname(unlist(gs_list$gs_positive))
print(paste0('The total number of unique markers is: ', length(markers)))
markers.present<-intersect(markers, toupper(rownames(abc.integrated)))
print(paste0('The total number of unique markers in the HVGs: ', length(markers.present)))
fwrite(as.list(markers.present), paste0(data_path, 'interim/scType_pos_markers.txt'))
[1] "The total number of unique markers is: 720" [1] "The total number of unique markers in the HVGs: 114"
markers.present<-rownames(abc.integrated)[toupper(rownames(abc.integrated)) %in% markers] # keep mouse gene name
h_ = 12
w_ = 45
options(repr.plot.height=h_, repr.plot.width=w_)
theme = theme(panel.background = element_blank(),
panel.border = element_rect(colour = 'black', fill = 'NA'),
text = element_text(size=28), panel.spacing = unit(1.15, "lines"),
axis.title=element_text(size=23), legend.text=element_text(size=16),
legend.title=element_text(size=22), axis.text.y = element_text(size = 18),
legend.key = element_blank(),
axis.text.x = element_text(size = 16))
suppressWarnings({
g<-DotPlot(abc.integrated, features = markers.present, cols = c('blue', 'red'), cluster.idents = T) +
RotatedAxis()+
xlab('Marker Gene') + ylab('Cluster') + theme + ggtitle("scType markers")
})
# for (ext in c('.svg', '.png', '.pdf')){ggsave(paste0(data_path, 'figures/processing/',
# 'scType_marker_dotplot', ext), g,
# height = h_, width = w_)}
g
Identify markers for each cluster to double check that annotated cell types make sense and re-annotate low-confidence cell types/those that were annotated across disparate clusters.
According to this study, there is not high concordance in methods, but Wilcoxon rank-sum is the most commonly used (and Seurat's default) and there is no need to account for confounders since we are considering all cells across batch.
# # Wilcoxon
# # normalized counts as recommended by
# # https://github.com/satijalab/seurat/issues/2636
# # https://github.com/satijalab/seurat/discussions/4000
# markers.wilcoxon<-FindAllMarkers(object = abc.integrated, assay = 'RNA', only.pos = T,
# slot = 'data', test.use = 'wilcox',
# min.pct = 0.25, # stringent since markers
# logfc.threshold = 0.5
# )
# saveRDS(markers.wilcoxon, paste0(data_path, 'interim/cluster_markers_wilcoxon.RDS'))
markers<-readRDS(paste0(data_path, 'interim/cluster_markers_wilcoxon.RDS'))
markers<-readRDS(paste0(data_path, 'interim/cluster_markers_wilcoxon.RDS'))
# format markers
markers.excel<-function(marker, de.type){
markers_workbook<-createWorkbook()
for (cluster in sort(unique(marker$cluster))){
de.res.cl<-as.data.frame(marker[marker$cluster == cluster, ])
rownames(de.res.cl)<-1:dim(de.res.cl)[[1]]
# # make infinites characlers, otherwise writes as NULL
# idx_<-as.numeric(rownames(de.res.cl[is.infinite(de.res.cl$LFC), ]))
addWorksheet(markers_workbook, cluster)
writeData(markers_workbook, sheet = cluster, x = de.res.cl)
}
saveWorkbook(markers_workbook, overwrite = T,
paste0(data_path, 'interim/', de.type, '_markers.xlsx'))
}
counter<-1
suppressMessages({
suppressWarnings({
de.type<-'wilcoxon'
markers<-markers[markers$p_val_adj <= 0.1,] # threshold on p_adj
markers<-markers[markers$avg_log2FC > 0.75,] # further threshold on LFC
markers<-markers[with(markers, order(cluster, -avg_log2FC)), ] # sort by effect size
markers[['scType.annotation']]<-unlist(unname(mapper[as.character(markers$cluster)]))
markers.excel(markers, de.type) # save to excel file
})
})
saveRDS(abc.integrated,
file = paste0(data_path, 'interim/abc_sctype_annotated.RDS'))